library(data.table)
library(ggplot2)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(caret)
## Loading required package: lattice
title_url <- "https://datasets.imdbws.com/title.basics.tsv.gz"
download.file(title_url, destfile = "title.basics.tsv", method = "curl")
title <- read.delim2("title.basics.tsv")
ratings_url <- "https://datasets.imdbws.com/title.ratings.tsv.gz"
download.file(ratings_url, destfile = "title.ratings.tsv", method = "curl")
ratings <- read.delim("title.ratings.tsv")
For the project’s scope, I decided to retain only movie data, discarding all other information. Subsequently, I performed a left join operation to combine movie titles with their corresponding ratings.
title <- subset(title, titleType == "movie" )
movie <- merge(title, ratings, by = "tconst", all.x = TRUE)
movie <- na.omit(movie) # Drop possible na's
Since it is not realistic a movie has a running time over 200 minutes, I decided to omit such movies.
movie$runtimeMinutes = as.numeric(as.character(movie$runtimeMinutes))
## Warning: NAs introduced by coercion
movie$startYear = as.numeric(as.character(movie$startYear))
## Warning: NAs introduced by coercion
movie = movie[which(movie$runtimeMinutes<=200),]
movie = na.omit(movie)
head(movie)
## tconst titleType primaryTitle
## 1 tt0000009 movie Miss Jerry
## 2 tt0000147 movie The Corbett-Fitzsimmons Fight
## 3 tt0000502 movie Bohemios
## 4 tt0000574 movie The Story of the Kelly Gang
## 5 tt0000591 movie The Prodigal Son
## 9 tt0000679 movie The Fairylogue and Radio-Plays
## originalTitle isAdult startYear endYear runtimeMinutes
## 1 Miss Jerry 0 1894 \\N 45
## 2 The Corbett-Fitzsimmons Fight 0 1897 \\N 100
## 3 Bohemios 0 1905 \\N 100
## 4 The Story of the Kelly Gang 0 1906 \\N 70
## 5 L'enfant prodigue 0 1907 \\N 90
## 9 The Fairylogue and Radio-Plays 0 1908 \\N 120
## genres averageRating numVotes
## 1 Romance 5.3 204
## 2 Documentary,News,Sport 5.3 469
## 3 \\N 4.1 15
## 4 Action,Adventure,Biography 6.0 826
## 5 Drama 4.4 20
## 9 Adventure,Fantasy 5.1 68
In the code cell below, a Generalized Additive Model (GAM) is being
fitted to the movie dataset using the gam() function from
the mgcv package. The response variable in the model is
averageRating, which is being modeled as a smooth function
of the predictor variables runtimeMinutes and
startYear. The s() function specifies that
both predictors are being modeled as smooth terms.
The s() function with two predictors creates a 2D smooth
surface, which allows us to examine how averageRating
varies across different values of runtimeMinutes and
startYear. This type of model is particularly useful for
exploring nonlinear relationships between multiple variables and can
often reveal patterns that would be difficult to detect with a simpler
linear model.
movie.gam = gam(averageRating ~ s(runtimeMinutes, startYear), data = movie)
movie.grid = expand.grid(startYear = 1900:2010, runtimeMinutes = 0:200)
movie.gam.pred = as.vector(predict(movie.gam, newdata = movie.grid))
movie.gam.pred.df = data.frame(movie.grid, averageRating = movie.gam.pred)
ggplot(movie.gam.pred.df, aes(x = startYear, y = averageRating)) +
geom_point() +
scale_size_area() +
geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"), color = 'orange', se = FALSE) +
facet_wrap(~cut_number(runtimeMinutes, n = 10),) +
labs(title = "Movie Ratings", subtitle = "Based Year Released Grouped by Length of Movie(minutes)",
x = "Year of Movie was Released", y = "Average Movie Ratings")
In the facetted scatter plot above, we can see some clear explanation how average movie ratings changes, depending on the combinations of the runtimes and the released years. For movies whose runtime is over 120 minutes, we see that the average ratings are the most in 1960’s. On the other hand, average ratings of the short movies are the loweset for short movies.
ggplot(movie.gam.pred.df, aes(x = runtimeMinutes, y = startYear, fill = averageRating)) +
geom_raster() +
scale_fill_distiller(palette = "RdYlBu") +
coord_fixed() +
geom_contour(aes(z = averageRating))
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
plot_ly(movie.gam.pred.df,
x = ~runtimeMinutes,
y = ~startYear,
z = ~averageRating,
type = "scatter3d",
mode = "markers",
marker = list(size = .5))
Lastly, I did the cross validation for the GAM model. In the 5 fold cross validation, the GAM model shows average RMSE of 1.29.
# Set the number of folds for cross-validation
num_folds <- 5
# Create a function to train the GAM model and calculate RMSE
train_gam <- function(data, indices) {
train_data <- data[indices, ]
model <- gam(averageRating ~ s(runtimeMinutes, startYear), data = train_data)
return(model)
}
# Create a function to evaluate the model using RMSE
eval_rmse <- function(model, data) {
predictions <- predict(model, newdata = data)
rmse <- sqrt(mean((data$averageRating - predictions)^2))
return(rmse)
}
# Perform cross-validation
set.seed(1123) # Set seed for reproducibility
folds <- createFolds(movie$averageRating, k = num_folds)
rmse_values <- numeric(num_folds)
for (i in 1:num_folds) {
train_indices <- unlist(folds[-i])
test_indices <- folds[[i]]
train_data <- movie[train_indices, ]
test_data <- movie[test_indices, ]
model <- train_gam(train_data, train_indices)
rmse_values[i] <- eval_rmse(model, test_data)
}
# Calculate mean RMSE across folds
mean_rmse <- mean(rmse_values)
cat("Mean RMSE:", mean_rmse)
## Mean RMSE: 1.297991
The relationship between movie length and average ratings depends on the year in which the movie was released. In the 1950s and 60s, longer movies tended to have higher average ratings than shorter movies. This trend was largely a resulted from the emergence of television as a competitor to the movie industry. In order to distinguish themselves from TV, movie directors made their movies longer and more spectacular. For example, Lawrence of Arabia (1962) had a running time of 216 minutes, and The Ten Commandments (1956) had a running time of 222 minutes. Both of these movies won Academy Awards.
However, this trend changed in the 1980s with the invention of videotapes. Many movie directors wanted to sell their movies in ancillary markets through videotapes, but the problem was that videotapes could not contain movies longer than 3 hours. As a result, movie makers began to reduce the running time of their movies. While there were still long and spectacular movies being created, many shorter movies were also produced. Consequently, the highest average ratings were divided between longer and shorter movies.